library(LexisNexisTools)
library(dplyr)
library(ggplot2)
library(stringr)

# Work from the folder that holds the LexisNexis exports; every file name
# below is relative to this directory.
setwd("/Users/stephenchaudoin/Dropbox/Kill_Scrapes/IO Reviews/Kenya news/unidocs")

# Clear the global workspace before building the corpus.
rm(list = ls())

# Read all export files into a single LexisNexisTools object.
# File order: the base download, its numbered follow-ups (1) to (41),
# then "(41b)" and finally "Files (26)".
test <- lnt_read(
  c(
    "Files (100).DOCX",
    sprintf("Files (100)(%d).DOCX", 1:41),
    "Files (100)(41b).DOCX",
    "Files (26).DOCX"
  ),
  extract_paragraphs = FALSE
)


#******# Need to pause here: when prompted, choose English (option 2) as the language for the dates
#	This does create an annoying gap in coverage because Lexis has a bunch of articles with dates like "March 201..." (the original comment is cut off here)

# Join the metadata and the article text on ID into one data frame.
articles <- merge(test@meta, test@articles, by = "ID")
# The searches had to overlap because of download limits, so some articles
# appear more than once; keep the first row for each distinct article text.
articles <- distinct(articles, Article, .keep_all = TRUE)

# Combining some common tokens
# Each rule is c(pattern, replacement). Patterns are regular expressions,
# matched case-insensitively, and the rules are applied one after another in
# the order listed (later rules see the output of earlier ones).
articles$Article <- local({
  token_rules <- list(
    # Tidying HRW
    c("human rights watch", "humanrightswatch"),
    c("hrw", "humanrightswatch"),
    # Tidying human rights
    c("human rights", "humanrights"),
    # Tidying the ways that PEV is used
    c("post-election violence", "postelectionviolence"),
    c("post-electoral violence", "postelectionviolence"),
    c("post electoral violence", "postelectionviolence"),
    c("post election violence", "postelectionviolence"),
    # NOTE(review): the whitespace before "PEV" is part of the match and is
    # not re-inserted, so the replacement is glued onto the preceding word.
    c("\\sPEV", "postelectionviolence"),
    # NOTE(review): the parentheses are unescaped, so they act as a regex
    # group; this matches "postelectionviolence PEV", not a literal "(PEV)".
    # Left unchanged here because altering the corpus would alter the fitted
    # topics that later code refers to by number.
    c("postelectionviolence (PEV)", "postelectionviolence"),
    # Tidying the way that ICC is used
    c("International Criminal Court", "ICC"),
    # "against" gets removed as a stop word
    c("crimes-against-humanity", "crimesagainsthumanity")
  )
  Reduce(
    function(text, rule) {
      str_replace_all(text, regex(rule[1], ignore_case = TRUE), rule[2])
    },
    token_rules,
    articles$Article
  )
})


# Useful for looking for articles with a specific term
#articles[ which(regexpr("post-election violence",articles$Article, ignore.case = TRUE) != -1), ][1:3,]
#articles[ which(regexpr("\\sPEV",articles$Article, ignore.case = FALSE) != -1), ][1:10,]


# Indicators (1/0) for whether an article's text contains certain terms.
# Matching is case-insensitive and on substrings of the article text.
articles <- local({
  # 1 when `pattern` is found anywhere in the article text, 0 otherwise.
  has_term <- function(pattern) {
    ifelse(regexpr(pattern, articles$Article, ignore.case = TRUE) > 0, 1, 0)
  }

  # Any ICC-related term: prosecutor names, the court's full name, or "ICC".
  articles$hasICC <- has_term("bensouda|international criminal court|ICC|Ocampo")

  # "humanrights" is the merged token produced by the tidying step.
  articles$hasHR <- has_term("humanrights")

  # Direct attributions to Kenyatta.
  articles$kenyattasays <- has_term("kenyatta says|kenyatta said")

  # Combinations of the two main indicators.
  articles$hasICCHR <- with(articles, ifelse(hasICC == 1 & hasHR == 1, 1, 0))
  articles$hasHRnoICC <- with(articles, ifelse(hasICC != 1 & hasHR == 1, 1, 0))

  articles
})
	
# Daily totals of the indicator columns. Articles without a date are dropped,
# the per-day sums are attached to every article of that day, and then one row
# per Date is kept (its remaining columns come from that day's first article).
# The result is ungrouped so later operations are not silently done per Date.
articlesbyday <- articles %>%
  filter(!is.na(Date)) %>%
  group_by(Date) %>%
  dplyr::mutate(
    hasICCtotal = sum(hasICC),
    hasHRtotal = sum(hasHR),
    hasICCHRtotal = sum(hasICCHR),
    hasHRnoICCtotal = sum(hasHRnoICC)
  ) %>%
  distinct(Date, .keep_all = TRUE) %>%
  ungroup()
#articlesbyday <- as.data.frame(articlesbyday)

# Diagnostics: look up the days with particular ICC totals.
articlesbyday[which(articlesbyday$hasICCtotal == 87), ]		# this was an issue arising because Date = NA had a bunch of articles
articlesbyday[which(articlesbyday$hasICCtotal == 19), ]		# Date: 2012-01-25

#write.csv(articlesbyday, "/Users/stephenchaudoin/Dropbox/Kill_Scrapes/replication code for IO/kenyaarticlesbyday.csv")

# ggplot(articlesbyday, aes(x=Date, y= hasICCtotal)) +
#   geom_line(, color = "red") +
#   geom_line(aes(x=Date, y= hasICCHRtotal), color = "blue")


#############
# STM
#############
library(MASS)
library(Hmisc)
library(stringr)
library(stm)
library(tidyr)
library(fastDummies)
library(stringr)
library(reshape2)
library(lessR)
library(dplyr)
library(ggplot2)

# STM pre-processing: turn the article text into stm's document/vocab format,
# carrying the full `articles` data frame along as metadata.
set.seed(1234)
dfull.processed <- textProcessor(
  documents = articles$Article,
  metadata = articles
)
full.out <- prepDocuments(
  documents = dfull.processed$documents,
  vocab = dfull.processed$vocab,
  meta = dfull.processed$meta
)

#	Basic STM with K set to 100, PEV corpus
set.seed(1234)
# Fit the 100-topic model on the prepared documents (spectral initialisation,
# at most 175 EM iterations).
kenyanews.full.k100 <- stm(
  documents = full.out$documents,
  vocab = full.out$vocab,
  K = 100,
  max.em.its = 175,
  data = full.out$meta,
  init.type = "Spectral"
)
# Top 20 words per topic.
lt.kenyanews.full.k100 <- labelTopics(kenyanews.full.k100, n = 20, frexweight = 0.5)
lt.kenyanews.full.k100
# Document-topic proportions joined with metadata. Use the metadata returned by
# prepDocuments() rather than `articles`: it has exactly one row per document
# the model was fitted on, so rows stay aligned even if pre-processing dropped
# documents. When nothing was dropped it holds the same rows as `articles`.
dt.kenyanews.full.k100 <- make.dt(kenyanews.full.k100, meta = full.out$meta)

#	Basic STM with K set to 0, PEV corpus
set.seed(1234)
# Fit the model with K = 0 on the prepared documents (spectral initialisation,
# at most 175 EM iterations).
kenyanews.full.k0 <- stm(
  documents = full.out$documents,
  vocab = full.out$vocab,
  K = 0,
  max.em.its = 175,
  data = full.out$meta,
  init.type = "Spectral"
)
# Top 20 words per topic.
lt.kenyanews.full.k0 <- labelTopics(kenyanews.full.k0, n = 20, frexweight = 0.5)
lt.kenyanews.full.k0
# Document-topic proportions joined with metadata. Use the metadata returned by
# prepDocuments() rather than `articles`: it has exactly one row per document
# the model was fitted on, so rows stay aligned even if pre-processing dropped
# documents. When nothing was dropped it holds the same rows as `articles`.
dt.kenyanews.full.k0 <- make.dt(kenyanews.full.k0, meta = full.out$meta)


###
# Finding articles with words, diagnostics
###
#articles[ which(regexpr("crimes--human",articles$Article, ignore.case = TRUE) != -1), ][1:3,]

# Ten most representative articles for topics 3, 17 and 63 of the K = 0 model.
# `texts` has to line up one-to-one with the documents the model was fitted on,
# so take the text from the metadata returned by prepDocuments() instead of
# from `articles`, which can contain rows that pre-processing dropped.
thoughttest <- findThoughts(
  kenyanews.full.k0,
  texts = full.out$meta$Article,
  topics = c(3, 17, 63),
  n = 10
)
thoughttest

###
# Types of topics, prevalence by day
###

### Using the k.0 model
# Per-document prevalence of two topic families, each obtained by adding up
# the document-topic proportion columns of the listed topics.
# ICC topics summed: 6,9,12,13,14,21,25,31,47,53,58,60,62,72,83,84,97 (NOT including 17)
# NOTE(review): the original comment also listed topics 57 and 68 as ICC
# topics, but they were never part of the sum; confirm which is intended.
# HR topics summed: 3, 17, 63
dt.kenyanews.full.k0$icc <- local({
  icc_ids <- c(6, 9, 12, 13, 14, 21, 25, 31, 47, 53, 58, 60, 62, 72, 83, 84, 97)
  # Add the columns one at a time, left to right.
  Reduce(
    `+`,
    lapply(paste0("Topic", icc_ids), function(col) dt.kenyanews.full.k0[[col]])
  )
})
dt.kenyanews.full.k0$hrtopics <- local({
  hr_ids <- c(3, 17, 63)
  Reduce(
    `+`,
    lapply(paste0("Topic", hr_ids), function(col) dt.kenyanews.full.k0[[col]])
  )
})

# Sum and mean of topic prevalence by day for the K = 0 model, restricted to
# articles with a known date on or after 2010-01-01. Columns aggregated:
# Topic1 through Topic98 plus the icc and hrtopics aggregates.
dt.kenyanews.full.k0.sumbyday <- dt.kenyanews.full.k0 %>%
  filter(!is.na(Date), Date >= "2010-01-01") %>%
  group_by(Date) %>%
  summarise_at(vars(Topic1:Topic98, icc, hrtopics), sum)

dt.kenyanews.full.k0.meanbyday <- dt.kenyanews.full.k0 %>%
  filter(!is.na(Date), Date >= "2010-01-01") %>%
  group_by(Date) %>%
  summarise_at(vars(Topic1:Topic98, icc, hrtopics), mean)
	

#############################################################
library(gdata)
# Store only the two by-day tables needed for the figure. Saving them by name
# (instead of deleting everything else with keep() and then dumping the whole
# workspace with save.image()) leaves the fitted models in memory and keeps
# hidden objects such as the RNG state out of the file.
save(
  dt.kenyanews.full.k0.sumbyday,
  dt.kenyanews.full.k0.meanbyday,
  file = "kenya data for plots.RData"
)
#############################################################

#############################################################
# Start here for reproducing the appendix figure
#############################################################

# This section is meant to be runnable on its own, so attach ggplot2 here
# instead of relying on the library() calls at the top of the script.
library(ggplot2)

# Restores dt.kenyanews.full.k0.sumbyday and dt.kenyanews.full.k0.meanbyday.
# The path is relative: the working directory must be the folder where the
# .RData file was saved.
load("kenya data for plots.RData")

## Figure J.1
# Plot with loess lines, mean by day; has to be split into two parts for each curve because of the missing data in mid 2012
testplot <- ggplot() +
  # Daily means as points, one colour per topic family.
  geom_point(data = dt.kenyanews.full.k0.meanbyday, aes(x = Date, y = icc, col = "ICC Topics"), size = 0.5) +
  geom_point(data = dt.kenyanews.full.k0.meanbyday, aes(x = Date, y = hrtopics, col = "HR Topics"), size = 0.5) +
  # Smoothers for the period up to 2012-03-27 ...
  geom_smooth(data = subset(dt.kenyanews.full.k0.meanbyday, Date <= "2012-03-27"), aes(x = Date, y = icc, col = "ICC Topics"), span = 0.2) +
  geom_smooth(data = subset(dt.kenyanews.full.k0.meanbyday, Date <= "2012-03-27"), aes(x = Date, y = hrtopics, col = "HR Topics"), span = 0.2) +
  # ... and, separately, for the period from 2012-06-02 onwards.
  geom_smooth(data = subset(dt.kenyanews.full.k0.meanbyday, Date >= "2012-06-02"), aes(x = Date, y = icc, col = "ICC Topics"), span = 0.2) +
  geom_smooth(data = subset(dt.kenyanews.full.k0.meanbyday, Date >= "2012-06-02"), aes(x = Date, y = hrtopics, col = "HR Topics"), span = 0.2)

# Dotted vertical markers with text labels for key dates, then axis labels,
# legend title removal and a fixed 0-1 y range.
testplot <- testplot +
  geom_vline(xintercept = as.numeric(as.Date(c("2010-03-31"))), linetype = 3) +			# Investigation begins
  annotate(geom = "text", x = as.Date("2010-04-20"), y = 0.85, label = "ICC inv. begins") +
  geom_vline(xintercept = as.numeric(as.Date(c("2011-03-08"))), linetype = 3) +			# Kenyatta et al summonses
  annotate(geom = "text", x = as.Date("2011-03-08"), y = 0.93, label = "Summonses issued") +
  geom_vline(xintercept = as.numeric(as.Date(c("2012-01-23"))), linetype = 3) +			# Kenyatta charges confirmed
  annotate(geom = "text", x = as.Date("2012-02-23"), y = 0.7, label = "Kenyatta charges confirmed") +
  #    geom_vline(xintercept = as.numeric(as.Date(c("2013-09-10"))), linetype=3) +			# Ruto trial begins
  #	annotate(geom="text",x=as.Date("2013-04-23"),y=5,label="Ruto trial beg.") +
  xlab("Date") + ylab("Prevalence (mean)") + theme(legend.title = element_blank()) + ylim(0, 1)
testplot


